#!/usr/bin/env python
"""
Tool to perform standard bag-of-words classification on tweet text, using a range of classifiers

Sample usage:
python code/classify-bow.py data/dataset1/policy.csv -s code/stopwords.txt -o results/dataset1/policy-bow.csv
"""
from pathlib import Path
import random
from optparse import OptionParser
import logging as log
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import bow
from evaluation import ScoreCollection
from util import read_tweets

# --------------------------------------------------------------
# Evaluation Settings
# Fold count handed to every bow.apply_cv / bow.apply_grid_cv call in main()
cv_folds = 5

def seed_random(random_seed):
	"""
	Re-seed the random number generators used by the experiments.

	Logs the seed, then applies it to both Python's built-in `random` module
	and NumPy's global random state.
	"""
	message = "Resetting random seed %s" % random_seed
	log.info(message)
	# stdlib generator first, then NumPy's global generator
	for reseed in (random.seed, np.random.seed):
		reseed(random_seed)

# --------------------------------------------------------------

def main():
	"""
	Command-line entry point: run every bag-of-words experiment and save the scores.

	Parses the command line (one positional data path plus the -o, -s and --seed
	options), reads the tweets, builds one pipeline per classifier (KNN,
	SGDClassifier labelled as SVM, logistic regression), evaluates each pipeline
	first with bow.apply_cv and then with bow.apply_grid_cv, and finally logs and
	saves the collected scores. The random seed is reset before every experiment.
	"""
	parser = OptionParser(usage="usage: %prog [options] data_path")
	parser.add_option("-o", action="store", type="string", dest="out_path", help="output path", default=None)
	parser.add_option("-s", action="store", type="string", dest="stoplist_file", help="custom stopword file path", default=None)
	parser.add_option("--seed", action="store", type="int", dest="seed", help="initial random seed", default=101)
	(options, args) = parser.parse_args()
	if len(args) != 1:
		parser.error("Must specify data file")
	log.basicConfig(level=log.INFO, format='%(message)s')

	# read the data
	in_path = Path(args[0])
	log.info("Reading data from %s", in_path)
	df = read_tweets(in_path)
	log.info("Read dataset with %d rows", len(df))

	# output file for results: always a Path, whether defaulted or given via -o
	out_path = Path("result.csv") if options.out_path is None else Path(options.out_path)
	# create the target directory up front, so a missing directory cannot cause
	# the save to fail only after all experiments have already been run
	out_path.parent.mkdir(parents=True, exist_ok=True)

	# Load stopwords, if specified
	stopwords = set()
	if options.stoplist_file is not None:
		log.info("Using stopword list from %s", options.stoplist_file)
		stopwords = bow.load_word_list(options.stoplist_file)
	log.info("Using %d stopwords", len(stopwords))

	# Create the pipelines
	pipeline_knn = bow.create_pipeline(KNeighborsClassifier(n_neighbors=1), stopwords)
	pipeline_svm = bow.create_pipeline(SGDClassifier(), stopwords)
	pipeline_lr = bow.create_pipeline(LogisticRegression(max_iter=1000), stopwords)

	# Parameter grids for the grid-search experiments
	knn_grid = {"clf__n_neighbors": list(range(1, 21))}
	svm_grid = {
		# SGDClassifier only uses l1_ratio when penalty is "elasticnet"; with the
		# default "l2" penalty every l1_ratio value would fit the same model
		"clf__penalty": ["elasticnet"],
		"clf__l1_ratio": [.05, .1, .15, .2, .25, .3, .4, .5, .6, .7, .8, .9, .95, .99, 1],
		"clf__alpha": np.power(10, np.arange(-4, 1, dtype=float)),
	}
	lr_grid = {"clf__C": np.logspace(-3, 3, 7)}

	# (description, score key, pipeline, parameter grid or None for plain CV),
	# listed in execution order: all plain CV runs first, then the grid searches
	experiments = [
		("KNN + CV", "knn-cv", pipeline_knn, None),
		("SVM + CV", "svm-cv", pipeline_svm, None),
		("Logistic Regression + CV", "lr-cv", pipeline_lr, None),
		("KNN + GridCV", "knn-gridcv", pipeline_knn, knn_grid),
		("SVM + GridCV", "svm-gridcv", pipeline_svm, svm_grid),
		("Logistic Regression + GridCV", "lr-gridcv", pipeline_lr, lr_grid),
	]

	scores = ScoreCollection()
	for description, score_key, pipeline, grid in experiments:
		log.info("- Applying %s", description)
		# reset the seed so every experiment starts from the same random state
		seed_random(options.seed)
		if grid is None:
			experiment = bow.apply_cv(df, pipeline, cv_folds)
		else:
			experiment = bow.apply_grid_cv(df, pipeline, cv_folds, grid)
		log.info(experiment)
		scores.add(score_key, experiment)

	# display the results
	log.info(str(scores))
	# export the results
	log.info("Writing %s", out_path)
	scores.save(out_path)

# --------------------------------------------------------------

# Run the experiments only when this file is executed as a script, not on import
if __name__ == "__main__":
	main()
